{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n",
    "import xgboost\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import roc_auc_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.width', 1000)\n",
    "pd.set_option('display.float_format', '{:.0f}'.format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "base_dir = \"D:\\wk\\myProject\\project_insureFraudPredict\\data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv(f\"{base_dir}\\\\train.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>policy_id</th>\n",
       "      <th>age</th>\n",
       "      <th>customer_months</th>\n",
       "      <th>policy_bind_date</th>\n",
       "      <th>policy_state</th>\n",
       "      <th>policy_csl</th>\n",
       "      <th>policy_deductable</th>\n",
       "      <th>policy_annual_premium</th>\n",
       "      <th>umbrella_limit</th>\n",
       "      <th>insured_zip</th>\n",
       "      <th>insured_sex</th>\n",
       "      <th>insured_education_level</th>\n",
       "      <th>insured_occupation</th>\n",
       "      <th>insured_hobbies</th>\n",
       "      <th>insured_relationship</th>\n",
       "      <th>capital-gains</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>incident_date</th>\n",
       "      <th>incident_type</th>\n",
       "      <th>collision_type</th>\n",
       "      <th>incident_severity</th>\n",
       "      <th>authorities_contacted</th>\n",
       "      <th>incident_state</th>\n",
       "      <th>incident_city</th>\n",
       "      <th>incident_hour_of_the_day</th>\n",
       "      <th>number_of_vehicles_involved</th>\n",
       "      <th>property_damage</th>\n",
       "      <th>bodily_injuries</th>\n",
       "      <th>witnesses</th>\n",
       "      <th>police_report_available</th>\n",
       "      <th>total_claim_amount</th>\n",
       "      <th>injury_claim</th>\n",
       "      <th>property_claim</th>\n",
       "      <th>vehicle_claim</th>\n",
       "      <th>auto_make</th>\n",
       "      <th>auto_model</th>\n",
       "      <th>auto_year</th>\n",
       "      <th>fraud</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>122576</td>\n",
       "      <td>37</td>\n",
       "      <td>189</td>\n",
       "      <td>2013-08-21</td>\n",
       "      <td>C</td>\n",
       "      <td>500/1000</td>\n",
       "      <td>1000</td>\n",
       "      <td>1466</td>\n",
       "      <td>5000000</td>\n",
       "      <td>455456</td>\n",
       "      <td>FEMALE</td>\n",
       "      <td>Masters</td>\n",
       "      <td>protective-serv</td>\n",
       "      <td>reading</td>\n",
       "      <td>not-in-family</td>\n",
       "      <td>62203</td>\n",
       "      <td>0</td>\n",
       "      <td>2014-12-22</td>\n",
       "      <td>Single Vehicle Collision</td>\n",
       "      <td>Side Collision</td>\n",
       "      <td>Total Loss</td>\n",
       "      <td>Ambulance</td>\n",
       "      <td>S5</td>\n",
       "      <td>Riverwood</td>\n",
       "      <td>21</td>\n",
       "      <td>1</td>\n",
       "      <td>?</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>?</td>\n",
       "      <td>54930</td>\n",
       "      <td>6029</td>\n",
       "      <td>5752</td>\n",
       "      <td>44452</td>\n",
       "      <td>Nissan</td>\n",
       "      <td>Maxima</td>\n",
       "      <td>2000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>937713</td>\n",
       "      <td>44</td>\n",
       "      <td>234</td>\n",
       "      <td>1998-01-04</td>\n",
       "      <td>B</td>\n",
       "      <td>250/500</td>\n",
       "      <td>500</td>\n",
       "      <td>821</td>\n",
       "      <td>0</td>\n",
       "      <td>591805</td>\n",
       "      <td>MALE</td>\n",
       "      <td>JD</td>\n",
       "      <td>craft-repair</td>\n",
       "      <td>polo</td>\n",
       "      <td>other-relative</td>\n",
       "      <td>31606</td>\n",
       "      <td>0</td>\n",
       "      <td>2015-02-18</td>\n",
       "      <td>Multi-vehicle Collision</td>\n",
       "      <td>Side Collision</td>\n",
       "      <td>Minor Damage</td>\n",
       "      <td>Other</td>\n",
       "      <td>S5</td>\n",
       "      <td>Springfield</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>?</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>YES</td>\n",
       "      <td>50680</td>\n",
       "      <td>5376</td>\n",
       "      <td>10156</td>\n",
       "      <td>37347</td>\n",
       "      <td>Honda</td>\n",
       "      <td>Civic</td>\n",
       "      <td>1996</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>680237</td>\n",
       "      <td>33</td>\n",
       "      <td>23</td>\n",
       "      <td>1996-02-06</td>\n",
       "      <td>B</td>\n",
       "      <td>500/1000</td>\n",
       "      <td>1000</td>\n",
       "      <td>1844</td>\n",
       "      <td>0</td>\n",
       "      <td>442490</td>\n",
       "      <td>FEMALE</td>\n",
       "      <td>High School</td>\n",
       "      <td>machine-op-inspct</td>\n",
       "      <td>skydiving</td>\n",
       "      <td>wife</td>\n",
       "      <td>0</td>\n",
       "      <td>-43166</td>\n",
       "      <td>2015-01-18</td>\n",
       "      <td>Single Vehicle Collision</td>\n",
       "      <td>Side Collision</td>\n",
       "      <td>Total Loss</td>\n",
       "      <td>Police</td>\n",
       "      <td>S3</td>\n",
       "      <td>Northbend</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>?</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>NO</td>\n",
       "      <td>47829</td>\n",
       "      <td>4460</td>\n",
       "      <td>9247</td>\n",
       "      <td>33644</td>\n",
       "      <td>Jeep</td>\n",
       "      <td>Wrangler</td>\n",
       "      <td>2002</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>513080</td>\n",
       "      <td>42</td>\n",
       "      <td>210</td>\n",
       "      <td>2008-11-14</td>\n",
       "      <td>A</td>\n",
       "      <td>500/1000</td>\n",
       "      <td>500</td>\n",
       "      <td>1867</td>\n",
       "      <td>0</td>\n",
       "      <td>439408</td>\n",
       "      <td>MALE</td>\n",
       "      <td>JD</td>\n",
       "      <td>transport-moving</td>\n",
       "      <td>video-games</td>\n",
       "      <td>own-child</td>\n",
       "      <td>0</td>\n",
       "      <td>-49440</td>\n",
       "      <td>2015-02-02</td>\n",
       "      <td>Multi-vehicle Collision</td>\n",
       "      <td>Front Collision</td>\n",
       "      <td>Major Damage</td>\n",
       "      <td>Fire</td>\n",
       "      <td>S3</td>\n",
       "      <td>Northbend</td>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "      <td>YES</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>YES</td>\n",
       "      <td>68862</td>\n",
       "      <td>11043</td>\n",
       "      <td>5955</td>\n",
       "      <td>53548</td>\n",
       "      <td>Suburu</td>\n",
       "      <td>Legacy</td>\n",
       "      <td>2003</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>192875</td>\n",
       "      <td>29</td>\n",
       "      <td>81</td>\n",
       "      <td>2002-01-08</td>\n",
       "      <td>A</td>\n",
       "      <td>100/300</td>\n",
       "      <td>1000</td>\n",
       "      <td>816</td>\n",
       "      <td>0</td>\n",
       "      <td>640575</td>\n",
       "      <td>FEMALE</td>\n",
       "      <td>MD</td>\n",
       "      <td>craft-repair</td>\n",
       "      <td>video-games</td>\n",
       "      <td>own-child</td>\n",
       "      <td>75296</td>\n",
       "      <td>-73689</td>\n",
       "      <td>2015-02-09</td>\n",
       "      <td>Multi-vehicle Collision</td>\n",
       "      <td>Rear Collision</td>\n",
       "      <td>Total Loss</td>\n",
       "      <td>Fire</td>\n",
       "      <td>S2</td>\n",
       "      <td>Northbend</td>\n",
       "      <td>9</td>\n",
       "      <td>3</td>\n",
       "      <td>YES</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>YES</td>\n",
       "      <td>59726</td>\n",
       "      <td>5617</td>\n",
       "      <td>10301</td>\n",
       "      <td>41550</td>\n",
       "      <td>Ford</td>\n",
       "      <td>F150</td>\n",
       "      <td>2004</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>695</th>\n",
       "      <td>1008425</td>\n",
       "      <td>37</td>\n",
       "      <td>196</td>\n",
       "      <td>1997-06-29</td>\n",
       "      <td>C</td>\n",
       "      <td>250/500</td>\n",
       "      <td>500</td>\n",
       "      <td>1301</td>\n",
       "      <td>0</td>\n",
       "      <td>474615</td>\n",
       "      <td>MALE</td>\n",
       "      <td>JD</td>\n",
       "      <td>tech-support</td>\n",
       "      <td>video-games</td>\n",
       "      <td>wife</td>\n",
       "      <td>47627</td>\n",
       "      <td>0</td>\n",
       "      <td>2015-01-18</td>\n",
       "      <td>Single Vehicle Collision</td>\n",
       "      <td>Front Collision</td>\n",
       "      <td>Major Damage</td>\n",
       "      <td>Ambulance</td>\n",
       "      <td>S5</td>\n",
       "      <td>Columbus</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>?</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>NO</td>\n",
       "      <td>61433</td>\n",
       "      <td>10436</td>\n",
       "      <td>11432</td>\n",
       "      <td>39745</td>\n",
       "      <td>Nissan</td>\n",
       "      <td>Pathfinder</td>\n",
       "      <td>2011</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>696</th>\n",
       "      <td>770702</td>\n",
       "      <td>43</td>\n",
       "      <td>229</td>\n",
       "      <td>2001-05-29</td>\n",
       "      <td>A</td>\n",
       "      <td>250/500</td>\n",
       "      <td>500</td>\n",
       "      <td>1435</td>\n",
       "      <td>8000000</td>\n",
       "      <td>444476</td>\n",
       "      <td>MALE</td>\n",
       "      <td>College</td>\n",
       "      <td>machine-op-inspct</td>\n",
       "      <td>golf</td>\n",
       "      <td>husband</td>\n",
       "      <td>0</td>\n",
       "      <td>-32289</td>\n",
       "      <td>2015-01-13</td>\n",
       "      <td>Multi-vehicle Collision</td>\n",
       "      <td>Rear Collision</td>\n",
       "      <td>Major Damage</td>\n",
       "      <td>Ambulance</td>\n",
       "      <td>S1</td>\n",
       "      <td>Arlington</td>\n",
       "      <td>17</td>\n",
       "      <td>3</td>\n",
       "      <td>NO</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>?</td>\n",
       "      <td>68623</td>\n",
       "      <td>6798</td>\n",
       "      <td>14557</td>\n",
       "      <td>50606</td>\n",
       "      <td>Volkswagen</td>\n",
       "      <td>Passat</td>\n",
       "      <td>2013</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>697</th>\n",
       "      <td>755099</td>\n",
       "      <td>35</td>\n",
       "      <td>209</td>\n",
       "      <td>2003-01-11</td>\n",
       "      <td>C</td>\n",
       "      <td>100/300</td>\n",
       "      <td>500</td>\n",
       "      <td>1639</td>\n",
       "      <td>0</td>\n",
       "      <td>639608</td>\n",
       "      <td>FEMALE</td>\n",
       "      <td>College</td>\n",
       "      <td>transport-moving</td>\n",
       "      <td>golf</td>\n",
       "      <td>not-in-family</td>\n",
       "      <td>0</td>\n",
       "      <td>-40797</td>\n",
       "      <td>2015-03-05</td>\n",
       "      <td>Multi-vehicle Collision</td>\n",
       "      <td>Rear Collision</td>\n",
       "      <td>Minor Damage</td>\n",
       "      <td>Fire</td>\n",
       "      <td>S2</td>\n",
       "      <td>Riverwood</td>\n",
       "      <td>7</td>\n",
       "      <td>3</td>\n",
       "      <td>NO</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>YES</td>\n",
       "      <td>58033</td>\n",
       "      <td>9129</td>\n",
       "      <td>4598</td>\n",
       "      <td>40740</td>\n",
       "      <td>Mercedes</td>\n",
       "      <td>C300</td>\n",
       "      <td>2002</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>698</th>\n",
       "      <td>693804</td>\n",
       "      <td>44</td>\n",
       "      <td>275</td>\n",
       "      <td>2003-07-22</td>\n",
       "      <td>B</td>\n",
       "      <td>500/1000</td>\n",
       "      <td>2000</td>\n",
       "      <td>1042</td>\n",
       "      <td>0</td>\n",
       "      <td>432061</td>\n",
       "      <td>FEMALE</td>\n",
       "      <td>Associate</td>\n",
       "      <td>machine-op-inspct</td>\n",
       "      <td>paintball</td>\n",
       "      <td>other-relative</td>\n",
       "      <td>46822</td>\n",
       "      <td>0</td>\n",
       "      <td>2015-01-09</td>\n",
       "      <td>Multi-vehicle Collision</td>\n",
       "      <td>Rear Collision</td>\n",
       "      <td>Major Damage</td>\n",
       "      <td>Ambulance</td>\n",
       "      <td>S5</td>\n",
       "      <td>Northbend</td>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "      <td>?</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>NO</td>\n",
       "      <td>35253</td>\n",
       "      <td>7359</td>\n",
       "      <td>3464</td>\n",
       "      <td>24677</td>\n",
       "      <td>Audi</td>\n",
       "      <td>A3</td>\n",
       "      <td>2007</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>699</th>\n",
       "      <td>598086</td>\n",
       "      <td>47</td>\n",
       "      <td>263</td>\n",
       "      <td>1996-08-15</td>\n",
       "      <td>C</td>\n",
       "      <td>500/1000</td>\n",
       "      <td>500</td>\n",
       "      <td>1283</td>\n",
       "      <td>0</td>\n",
       "      <td>433809</td>\n",
       "      <td>FEMALE</td>\n",
       "      <td>High School</td>\n",
       "      <td>machine-op-inspct</td>\n",
       "      <td>sleeping</td>\n",
       "      <td>wife</td>\n",
       "      <td>54087</td>\n",
       "      <td>-61343</td>\n",
       "      <td>2015-01-08</td>\n",
       "      <td>Multi-vehicle Collision</td>\n",
       "      <td>Side Collision</td>\n",
       "      <td>Total Loss</td>\n",
       "      <td>Police</td>\n",
       "      <td>S4</td>\n",
       "      <td>Hillsdale</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>?</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NO</td>\n",
       "      <td>24320</td>\n",
       "      <td>2250</td>\n",
       "      <td>4285</td>\n",
       "      <td>18092</td>\n",
       "      <td>Suburu</td>\n",
       "      <td>Forrestor</td>\n",
       "      <td>2008</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>700 rows × 38 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     policy_id  age  customer_months policy_bind_date policy_state policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  insured_zip insured_sex insured_education_level insured_occupation insured_hobbies insured_relationship  capital-gains  capital-loss incident_date             incident_type   collision_type incident_severity authorities_contacted incident_state incident_city  incident_hour_of_the_day  number_of_vehicles_involved property_damage  bodily_injuries  witnesses police_report_available  total_claim_amount  injury_claim  property_claim  vehicle_claim   auto_make  auto_model  auto_year  fraud\n",
       "0       122576   37              189       2013-08-21            C   500/1000               1000                   1466         5000000       455456      FEMALE                 Masters    protective-serv         reading        not-in-family          62203             0    2014-12-22  Single Vehicle Collision   Side Collision        Total Loss             Ambulance             S5     Riverwood                        21                            1               ?                0          3                       ?               54930          6029            5752          44452      Nissan      Maxima       2000      0\n",
       "1       937713   44              234       1998-01-04            B    250/500                500                    821               0       591805        MALE                      JD       craft-repair            polo       other-relative          31606             0    2015-02-18   Multi-vehicle Collision   Side Collision      Minor Damage                 Other             S5   Springfield                         4                            3               ?                2          1                     YES               50680          5376           10156          37347       Honda       Civic       1996      0\n",
       "2       680237   33               23       1996-02-06            B   500/1000               1000                   1844               0       442490      FEMALE             High School  machine-op-inspct       skydiving                 wife              0        -43166    2015-01-18  Single Vehicle Collision   Side Collision        Total Loss                Police             S3     Northbend                         0                            1               ?                2          1                      NO               47829          4460            9247          33644        Jeep    Wrangler       2002      0\n",
       "3       513080   42              210       2008-11-14            A   500/1000                500                   1867               0       439408        MALE                      JD   transport-moving     video-games            own-child              0        -49440    2015-02-02   Multi-vehicle Collision  Front Collision      Major Damage                  Fire             S3     Northbend                        20                            3             YES                2          2                     YES               68862         11043            5955          53548      Suburu      Legacy       2003      1\n",
       "4       192875   29               81       2002-01-08            A    100/300               1000                    816               0       640575      FEMALE                      MD       craft-repair     video-games            own-child          75296        -73689    2015-02-09   Multi-vehicle Collision   Rear Collision        Total Loss                  Fire             S2     Northbend                         9                            3             YES                2          1                     YES               59726          5617           10301          41550        Ford        F150       2004      0\n",
       "..         ...  ...              ...              ...          ...        ...                ...                    ...             ...          ...         ...                     ...                ...             ...                  ...            ...           ...           ...                       ...              ...               ...                   ...            ...           ...                       ...                          ...             ...              ...        ...                     ...                 ...           ...             ...            ...         ...         ...        ...    ...\n",
       "695    1008425   37              196       1997-06-29            C    250/500                500                   1301               0       474615        MALE                      JD       tech-support     video-games                 wife          47627             0    2015-01-18  Single Vehicle Collision  Front Collision      Major Damage             Ambulance             S5      Columbus                         4                            1               ?                0          3                      NO               61433         10436           11432          39745      Nissan  Pathfinder       2011      1\n",
       "696     770702   43              229       2001-05-29            A    250/500                500                   1435         8000000       444476        MALE                 College  machine-op-inspct            golf              husband              0        -32289    2015-01-13   Multi-vehicle Collision   Rear Collision      Major Damage             Ambulance             S1     Arlington                        17                            3              NO                0          1                       ?               68623          6798           14557          50606  Volkswagen      Passat       2013      1\n",
       "697     755099   35              209       2003-01-11            C    100/300                500                   1639               0       639608      FEMALE                 College   transport-moving            golf        not-in-family              0        -40797    2015-03-05   Multi-vehicle Collision   Rear Collision      Minor Damage                  Fire             S2     Riverwood                         7                            3              NO                2          0                     YES               58033          9129            4598          40740    Mercedes        C300       2002      0\n",
       "698     693804   44              275       2003-07-22            B   500/1000               2000                   1042               0       432061      FEMALE               Associate  machine-op-inspct       paintball       other-relative          46822             0    2015-01-09   Multi-vehicle Collision   Rear Collision      Major Damage             Ambulance             S5     Northbend                        20                            3               ?                1          0                      NO               35253          7359            3464          24677        Audi          A3       2007      1\n",
       "699     598086   47              263       1996-08-15            C   500/1000                500                   1283               0       433809      FEMALE             High School  machine-op-inspct        sleeping                 wife          54087        -61343    2015-01-08   Multi-vehicle Collision   Side Collision        Total Loss                Police             S4     Hillsdale                         5                            3               ?                0          0                      NO               24320          2250            4285          18092      Suburu   Forrestor       2008      0\n",
       "\n",
       "[700 rows x 38 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------------整体概况----------------\n",
      "                                type  row_nums  null_nums  mean_num  std_num  nunique_nums\n",
      "policy_id                      int64       700          0    549625   259068           700\n",
      "age                            int64       700          0        39        9            45\n",
      "customer_months                int64       700          0       205      116           332\n",
      "policy_bind_date              object       700          0       NaN      NaN           674\n",
      "policy_state                  object       700          0       NaN      NaN             3\n",
      "policy_csl                    object       700          0       NaN      NaN             3\n",
      "policy_deductable              int64       700          0      1148      612             3\n",
      "policy_annual_premium        float64       700          0      1247      251           698\n",
      "umbrella_limit                 int64       700          0   1100000  2282922            11\n",
      "insured_zip                    int64       700          0    502797    74251           699\n",
      "insured_sex                   object       700          0       NaN      NaN             2\n",
      "insured_education_level       object       700          0       NaN      NaN             7\n",
      "insured_occupation            object       700          0       NaN      NaN            14\n",
      "insured_hobbies               object       700          0       NaN      NaN            20\n",
      "insured_relationship          object       700          0       NaN      NaN             6\n",
      "capital-gains                  int64       700          0     25842    28108           352\n",
      "capital-loss                   int64       700          0    -26247    28465           357\n",
      "incident_date                 object       700          0       NaN      NaN           112\n",
      "incident_type                 object       700          0       NaN      NaN             4\n",
      "collision_type                object       700          0       NaN      NaN             4\n",
      "incident_severity             object       700          0       NaN      NaN             4\n",
      "authorities_contacted         object       700         60       NaN      NaN             4\n",
      "incident_state                object       700          0       NaN      NaN             7\n",
      "incident_city                 object       700          0       NaN      NaN             7\n",
      "incident_hour_of_the_day       int64       700          0        12        7            24\n",
      "number_of_vehicles_involved    int64       700          0         2        1             4\n",
      "property_damage               object       700          0       NaN      NaN             3\n",
      "bodily_injuries                int64       700          0         1        1             3\n",
      "witnesses                      int64       700          0         1        1             4\n",
      "police_report_available       object       700          0       NaN      NaN             3\n",
      "total_claim_amount             int64       700          0     52423    26179           694\n",
      "injury_claim                   int64       700          0      7450     4889           665\n",
      "property_claim                 int64       700          0      7332     4787           662\n",
      "vehicle_claim                  int64       700          0     37688    18724           697\n",
      "auto_make                     object       700          0       NaN      NaN            14\n",
      "auto_model                    object       700          0       NaN      NaN            39\n",
      "auto_year                      int64       700          0      2005        6            21\n",
      "fraud                          int64       700          0         0        0             2\n",
      "----------------整体概况----------------\n"
     ]
    },
    {
     "ename": "PermissionError",
     "evalue": "[Errno 13] Permission denied: 'D:\\\\wk\\\\myProject\\\\project_insureFraudPredict\\\\data\\\\overview.xlsx'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mPermissionError\u001b[0m                           Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[6], line 24\u001b[0m\n\u001b[0;32m     21\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m overview\n\u001b[0;32m     23\u001b[0m overView \u001b[38;5;241m=\u001b[39m overViewAnalysis(train)\n\u001b[1;32m---> 24\u001b[0m \u001b[43moverView\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_excel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mbase_dir\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[38;5;130;43;01m\\\\\u001b[39;49;00m\u001b[38;5;124;43moverview.xlsx\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mopenpyxl\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mC:\\Program Files\\MySoft\\anaconda3\\Anaconda3_20240312\\envs\\dataMining_insuranceAntiFraud\\lib\\site-packages\\pandas\\util\\_decorators.py:333\u001b[0m, in \u001b[0;36mdeprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    327\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(args) \u001b[38;5;241m>\u001b[39m num_allow_args:\n\u001b[0;32m    328\u001b[0m     warnings\u001b[38;5;241m.\u001b[39mwarn(\n\u001b[0;32m    329\u001b[0m         msg\u001b[38;5;241m.\u001b[39mformat(arguments\u001b[38;5;241m=\u001b[39m_format_argument_list(allow_args)),\n\u001b[0;32m    330\u001b[0m         \u001b[38;5;167;01mFutureWarning\u001b[39;00m,\n\u001b[0;32m    331\u001b[0m         stacklevel\u001b[38;5;241m=\u001b[39mfind_stack_level(),\n\u001b[0;32m    332\u001b[0m     )\n\u001b[1;32m--> 333\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
      "File \u001b[1;32mC:\\Program Files\\MySoft\\anaconda3\\Anaconda3_20240312\\envs\\dataMining_insuranceAntiFraud\\lib\\site-packages\\pandas\\core\\generic.py:2417\u001b[0m, in \u001b[0;36mNDFrame.to_excel\u001b[1;34m(self, excel_writer, sheet_name, na_rep, float_format, columns, header, index, index_label, startrow, startcol, engine, merge_cells, inf_rep, freeze_panes, storage_options, engine_kwargs)\u001b[0m\n\u001b[0;32m   2404\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mio\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mformats\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexcel\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m ExcelFormatter\n\u001b[0;32m   2406\u001b[0m formatter \u001b[38;5;241m=\u001b[39m ExcelFormatter(\n\u001b[0;32m   2407\u001b[0m     df,\n\u001b[0;32m   2408\u001b[0m     na_rep\u001b[38;5;241m=\u001b[39mna_rep,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m   2415\u001b[0m     inf_rep\u001b[38;5;241m=\u001b[39minf_rep,\n\u001b[0;32m   2416\u001b[0m )\n\u001b[1;32m-> 2417\u001b[0m \u001b[43mformatter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwrite\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   2418\u001b[0m \u001b[43m    \u001b[49m\u001b[43mexcel_writer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2419\u001b[0m \u001b[43m    \u001b[49m\u001b[43msheet_name\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msheet_name\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2420\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstartrow\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstartrow\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2421\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstartcol\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstartcol\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2422\u001b[0m \u001b[43m    \u001b[49m\u001b[43mfreeze_panes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfreeze_panes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2423\u001b[0m \u001b[43m    \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2424\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2425\u001b[0m \u001b[43m    \u001b[49m\u001b[43mengine_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m   2426\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[1;32mC:\\Program Files\\MySoft\\anaconda3\\Anaconda3_20240312\\envs\\dataMining_insuranceAntiFraud\\lib\\site-packages\\pandas\\io\\formats\\excel.py:943\u001b[0m, in \u001b[0;36mExcelFormatter.write\u001b[1;34m(self, writer, sheet_name, startrow, startcol, freeze_panes, engine, storage_options, engine_kwargs)\u001b[0m\n\u001b[0;32m    941\u001b[0m     need_save \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[0;32m    942\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m--> 943\u001b[0m     writer \u001b[38;5;241m=\u001b[39m \u001b[43mExcelWriter\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m    944\u001b[0m \u001b[43m        \u001b[49m\u001b[43mwriter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    945\u001b[0m \u001b[43m        \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    946\u001b[0m \u001b[43m        \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    947\u001b[0m \u001b[43m        \u001b[49m\u001b[43mengine_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m    948\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    949\u001b[0m     need_save \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m    951\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
      "File \u001b[1;32mC:\\Program Files\\MySoft\\anaconda3\\Anaconda3_20240312\\envs\\dataMining_insuranceAntiFraud\\lib\\site-packages\\pandas\\io\\excel\\_openpyxl.py:61\u001b[0m, in \u001b[0;36mOpenpyxlWriter.__init__\u001b[1;34m(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs, **kwargs)\u001b[0m\n\u001b[0;32m     57\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mopenpyxl\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mworkbook\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m Workbook\n\u001b[0;32m     59\u001b[0m engine_kwargs \u001b[38;5;241m=\u001b[39m combine_kwargs(engine_kwargs, kwargs)\n\u001b[1;32m---> 61\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[0;32m     62\u001b[0m \u001b[43m    \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     63\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     64\u001b[0m \u001b[43m    \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     65\u001b[0m \u001b[43m    \u001b[49m\u001b[43mif_sheet_exists\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mif_sheet_exists\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     66\u001b[0m \u001b[43m    \u001b[49m\u001b[43mengine_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mengine_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m     67\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     69\u001b[0m \u001b[38;5;66;03m# ExcelWriter replaced \"a\" by \"r+\" to allow us to first read the excel file from\u001b[39;00m\n\u001b[0;32m     70\u001b[0m \u001b[38;5;66;03m# the file and later write to it\u001b[39;00m\n\u001b[0;32m     71\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mr+\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_mode:  \u001b[38;5;66;03m# Load from existing workbook\u001b[39;00m\n",
      "File \u001b[1;32mC:\\Program Files\\MySoft\\anaconda3\\Anaconda3_20240312\\envs\\dataMining_insuranceAntiFraud\\lib\\site-packages\\pandas\\io\\excel\\_base.py:1246\u001b[0m, in \u001b[0;36mExcelWriter.__init__\u001b[1;34m(self, path, engine, date_format, datetime_format, mode, storage_options, if_sheet_exists, engine_kwargs)\u001b[0m\n\u001b[0;32m   1242\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_handles \u001b[38;5;241m=\u001b[39m IOHandles(\n\u001b[0;32m   1243\u001b[0m     cast(IO[\u001b[38;5;28mbytes\u001b[39m], path), compression\u001b[38;5;241m=\u001b[39m{\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcompression\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;28;01mNone\u001b[39;00m}\n\u001b[0;32m   1244\u001b[0m )\n\u001b[0;32m   1245\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(path, ExcelWriter):\n\u001b[1;32m-> 1246\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_handles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m   1247\u001b[0m \u001b[43m        \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\n\u001b[0;32m   1248\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m   1249\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_cur_sheet \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m   1251\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m date_format \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
      "File \u001b[1;32mC:\\Program Files\\MySoft\\anaconda3\\Anaconda3_20240312\\envs\\dataMining_insuranceAntiFraud\\lib\\site-packages\\pandas\\io\\common.py:882\u001b[0m, in \u001b[0;36mget_handle\u001b[1;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[0;32m    873\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[0;32m    874\u001b[0m             handle,\n\u001b[0;32m    875\u001b[0m             ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    878\u001b[0m             newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m    879\u001b[0m         )\n\u001b[0;32m    880\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m    881\u001b[0m         \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m--> 882\u001b[0m         handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m    883\u001b[0m     handles\u001b[38;5;241m.\u001b[39mappend(handle)\n\u001b[0;32m    885\u001b[0m \u001b[38;5;66;03m# Convert BytesIO or file objects passed with an encoding\u001b[39;00m\n",
      "\u001b[1;31mPermissionError\u001b[0m: [Errno 13] Permission denied: 'D:\\\\wk\\\\myProject\\\\project_insureFraudPredict\\\\data\\\\overview.xlsx'"
     ]
    }
   ],
   "source": [
    "# 1. 数据预览，空值，均值，标准差，重复值\n",
    "\n",
    "## 空值：authorities_contacted 60个空值\n",
    "## \n",
    "def overViewAnalysis(dataframe):\n",
    "    print('----------------整体概况----------------')\n",
    "    overview = pd.DataFrame()\n",
    "    overview['type'] = dataframe.dtypes\n",
    "    overview['row_nums'] = dataframe.shape[0]\n",
    "    overview['null_nums'] = dataframe.isnull().sum()\n",
    "    # overview['min_num'] = dataframe.min()\n",
    "#     overview['max_num'] = dataframe.max()\n",
    "    overview['mean_num'] = dataframe.describe().loc['mean']\n",
    "    overview['std_num'] = dataframe.describe().loc['std']\n",
    "\n",
    "    for col in dataframe.columns:\n",
    "        overview.loc[col,'nunique_nums'] = dataframe[col].nunique()\n",
    "\n",
    "    print(overview)\n",
    "    print('----------------整体概况----------------')\n",
    "    return overview\n",
    "    \n",
    "overView = overViewAnalysis(train)\n",
    "overView.to_excel(f\"{base_dir}\\\\overview.xlsx\", engine='openpyxl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2. authorities_contacted 空值填充为 no_contacted\n",
    "train = train.fillna({'authorities_contacted': 'no_contacted'})\n",
    "train['authorities_contacted'].isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3. 对日期进行编码\n",
    "def date_to_timestamp_days(date_series):\n",
    "    \"\"\"\n",
    "    将日期序列转换为以天为单位的时间戳（自1970年1月1日起）。\n",
    "\n",
    "    参数:\n",
    "    date_series (pd.Series): 包含日期的Pandas Series对象。\n",
    "\n",
    "    返回:\n",
    "    pd.Series: 包含时间戳（以天为单位）的Pandas Series对象。\n",
    "    \"\"\"\n",
    "    \n",
    "    # 确保输入是datetime类型\n",
    "\n",
    "    if date_series.dtype == 'object':\n",
    "        try:\n",
    "            date_series_convert = pd.to_datetime(date_series, format='%Y-%m-%d')\n",
    "        except Exception as e:\n",
    "            print(f\"转换为datetime类型时出错: {e}\")\n",
    "    else:\n",
    "        print(\"'已经是非object类型，无需转换。\")\n",
    "\n",
    "    if not pd.api.types.is_datetime64_any_dtype(date_series_convert):\n",
    "        raise ValueError(\"输入的Series必须是datetime类型\")\n",
    "\n",
    "    # 计算时间戳（以天为单位）\n",
    "    timestamp_days = (date_series_convert - pd.Timestamp(\"1970-01-01\")).dt.days\n",
    "\n",
    "    return timestamp_days\n",
    "\n",
    "train['incident_date_timestamp_days'] = date_to_timestamp_days(train['incident_date']) \n",
    "train['policy_bind_date_timestamp_days'] = date_to_timestamp_days(train['policy_bind_date']) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>policy_id</th>\n",
       "      <th>age</th>\n",
       "      <th>customer_months</th>\n",
       "      <th>policy_bind_date</th>\n",
       "      <th>policy_state</th>\n",
       "      <th>policy_csl</th>\n",
       "      <th>policy_deductable</th>\n",
       "      <th>policy_annual_premium</th>\n",
       "      <th>umbrella_limit</th>\n",
       "      <th>insured_zip</th>\n",
       "      <th>insured_sex</th>\n",
       "      <th>insured_education_level</th>\n",
       "      <th>insured_occupation</th>\n",
       "      <th>insured_hobbies</th>\n",
       "      <th>insured_relationship</th>\n",
       "      <th>capital-gains</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>incident_date</th>\n",
       "      <th>incident_type</th>\n",
       "      <th>collision_type</th>\n",
       "      <th>incident_severity</th>\n",
       "      <th>authorities_contacted</th>\n",
       "      <th>incident_state</th>\n",
       "      <th>incident_city</th>\n",
       "      <th>incident_hour_of_the_day</th>\n",
       "      <th>number_of_vehicles_involved</th>\n",
       "      <th>property_damage</th>\n",
       "      <th>bodily_injuries</th>\n",
       "      <th>witnesses</th>\n",
       "      <th>police_report_available</th>\n",
       "      <th>total_claim_amount</th>\n",
       "      <th>injury_claim</th>\n",
       "      <th>property_claim</th>\n",
       "      <th>vehicle_claim</th>\n",
       "      <th>auto_make</th>\n",
       "      <th>auto_model</th>\n",
       "      <th>auto_year</th>\n",
       "      <th>fraud</th>\n",
       "      <th>incident_date_timestamp_days</th>\n",
       "      <th>policy_bind_date_timestamp_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>122576</td>\n",
       "      <td>37</td>\n",
       "      <td>189</td>\n",
       "      <td>629</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1000</td>\n",
       "      <td>1466</td>\n",
       "      <td>5000000</td>\n",
       "      <td>455456</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>15</td>\n",
       "      <td>1</td>\n",
       "      <td>62203</td>\n",
       "      <td>0</td>\n",
       "      <td>15</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>21</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>54930</td>\n",
       "      <td>6029</td>\n",
       "      <td>5752</td>\n",
       "      <td>44452</td>\n",
       "      <td>9</td>\n",
       "      <td>26</td>\n",
       "      <td>2000</td>\n",
       "      <td>0</td>\n",
       "      <td>16426</td>\n",
       "      <td>15938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>937713</td>\n",
       "      <td>44</td>\n",
       "      <td>234</td>\n",
       "      <td>233</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>500</td>\n",
       "      <td>821</td>\n",
       "      <td>0</td>\n",
       "      <td>591805</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "      <td>2</td>\n",
       "      <td>31606</td>\n",
       "      <td>0</td>\n",
       "      <td>73</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>50680</td>\n",
       "      <td>5376</td>\n",
       "      <td>10156</td>\n",
       "      <td>37347</td>\n",
       "      <td>6</td>\n",
       "      <td>10</td>\n",
       "      <td>1996</td>\n",
       "      <td>0</td>\n",
       "      <td>16484</td>\n",
       "      <td>10230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>680237</td>\n",
       "      <td>33</td>\n",
       "      <td>23</td>\n",
       "      <td>175</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1000</td>\n",
       "      <td>1844</td>\n",
       "      <td>0</td>\n",
       "      <td>442490</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>16</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>-43166</td>\n",
       "      <td>42</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>47829</td>\n",
       "      <td>4460</td>\n",
       "      <td>9247</td>\n",
       "      <td>33644</td>\n",
       "      <td>7</td>\n",
       "      <td>36</td>\n",
       "      <td>2002</td>\n",
       "      <td>0</td>\n",
       "      <td>16453</td>\n",
       "      <td>9532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>513080</td>\n",
       "      <td>42</td>\n",
       "      <td>210</td>\n",
       "      <td>516</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>500</td>\n",
       "      <td>1867</td>\n",
       "      <td>0</td>\n",
       "      <td>439408</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>13</td>\n",
       "      <td>18</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>-49440</td>\n",
       "      <td>57</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>68862</td>\n",
       "      <td>11043</td>\n",
       "      <td>5955</td>\n",
       "      <td>53548</td>\n",
       "      <td>11</td>\n",
       "      <td>21</td>\n",
       "      <td>2003</td>\n",
       "      <td>1</td>\n",
       "      <td>16468</td>\n",
       "      <td>14197</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>192875</td>\n",
       "      <td>29</td>\n",
       "      <td>81</td>\n",
       "      <td>334</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1000</td>\n",
       "      <td>816</td>\n",
       "      <td>0</td>\n",
       "      <td>640575</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>18</td>\n",
       "      <td>3</td>\n",
       "      <td>75296</td>\n",
       "      <td>-73689</td>\n",
       "      <td>64</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>9</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>59726</td>\n",
       "      <td>5617</td>\n",
       "      <td>10301</td>\n",
       "      <td>41550</td>\n",
       "      <td>5</td>\n",
       "      <td>14</td>\n",
       "      <td>2004</td>\n",
       "      <td>0</td>\n",
       "      <td>16475</td>\n",
       "      <td>11695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>695</th>\n",
       "      <td>1008425</td>\n",
       "      <td>37</td>\n",
       "      <td>196</td>\n",
       "      <td>216</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>500</td>\n",
       "      <td>1301</td>\n",
       "      <td>0</td>\n",
       "      <td>474615</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>12</td>\n",
       "      <td>18</td>\n",
       "      <td>5</td>\n",
       "      <td>47627</td>\n",
       "      <td>0</td>\n",
       "      <td>42</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>61433</td>\n",
       "      <td>10436</td>\n",
       "      <td>11432</td>\n",
       "      <td>39745</td>\n",
       "      <td>9</td>\n",
       "      <td>29</td>\n",
       "      <td>2011</td>\n",
       "      <td>1</td>\n",
       "      <td>16453</td>\n",
       "      <td>10041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>696</th>\n",
       "      <td>770702</td>\n",
       "      <td>43</td>\n",
       "      <td>229</td>\n",
       "      <td>312</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>500</td>\n",
       "      <td>1435</td>\n",
       "      <td>8000000</td>\n",
       "      <td>444476</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>-32289</td>\n",
       "      <td>37</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>17</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>68623</td>\n",
       "      <td>6798</td>\n",
       "      <td>14557</td>\n",
       "      <td>50606</td>\n",
       "      <td>13</td>\n",
       "      <td>28</td>\n",
       "      <td>2013</td>\n",
       "      <td>1</td>\n",
       "      <td>16448</td>\n",
       "      <td>11471</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>697</th>\n",
       "      <td>755099</td>\n",
       "      <td>35</td>\n",
       "      <td>209</td>\n",
       "      <td>359</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>500</td>\n",
       "      <td>1639</td>\n",
       "      <td>0</td>\n",
       "      <td>639608</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>-40797</td>\n",
       "      <td>88</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>58033</td>\n",
       "      <td>9129</td>\n",
       "      <td>4598</td>\n",
       "      <td>40740</td>\n",
       "      <td>8</td>\n",
       "      <td>7</td>\n",
       "      <td>2002</td>\n",
       "      <td>0</td>\n",
       "      <td>16499</td>\n",
       "      <td>12063</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>698</th>\n",
       "      <td>693804</td>\n",
       "      <td>44</td>\n",
       "      <td>275</td>\n",
       "      <td>376</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2000</td>\n",
       "      <td>1042</td>\n",
       "      <td>0</td>\n",
       "      <td>432061</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>46822</td>\n",
       "      <td>0</td>\n",
       "      <td>33</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>35253</td>\n",
       "      <td>7359</td>\n",
       "      <td>3464</td>\n",
       "      <td>24677</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2007</td>\n",
       "      <td>1</td>\n",
       "      <td>16444</td>\n",
       "      <td>12255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>699</th>\n",
       "      <td>598086</td>\n",
       "      <td>47</td>\n",
       "      <td>263</td>\n",
       "      <td>189</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>500</td>\n",
       "      <td>1283</td>\n",
       "      <td>0</td>\n",
       "      <td>433809</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>17</td>\n",
       "      <td>5</td>\n",
       "      <td>54087</td>\n",
       "      <td>-61343</td>\n",
       "      <td>32</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>24320</td>\n",
       "      <td>2250</td>\n",
       "      <td>4285</td>\n",
       "      <td>18092</td>\n",
       "      <td>11</td>\n",
       "      <td>15</td>\n",
       "      <td>2008</td>\n",
       "      <td>0</td>\n",
       "      <td>16443</td>\n",
       "      <td>9723</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>700 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     policy_id  age  customer_months  policy_bind_date  policy_state  policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  insured_zip  insured_sex  insured_education_level  insured_occupation  insured_hobbies  insured_relationship  capital-gains  capital-loss  incident_date  incident_type  collision_type  incident_severity  authorities_contacted  incident_state  incident_city  incident_hour_of_the_day  number_of_vehicles_involved  property_damage  bodily_injuries  witnesses  police_report_available  total_claim_amount  injury_claim  property_claim  vehicle_claim  auto_make  auto_model  auto_year  fraud  incident_date_timestamp_days  policy_bind_date_timestamp_days\n",
       "0       122576   37              189               629             2           2               1000                   1466         5000000       455456            0                        5                  10               15                     1          62203             0             15              2               3                  2                      0               4              5                        21                            1                0                0          3                        0               54930          6029            5752          44452          9          26       2000      0                         16426                            15938\n",
       "1       937713   44              234               233             1           1                500                    821               0       591805            1                        3                   2               14                     2          31606             0             73              0               3                  1                      2               4              6                         4                            3                0                2          1                        2               50680          5376           10156          37347          6          10       1996      0                         16484                            10230\n",
       "2       680237   33               23               175             1           2               1000                   1844               0       442490            0                        2                   6               16                     5              0        -43166             42              2               3                  2                      3               2              3                         0                            1                0                2          1                        1               47829          4460            9247          33644          7          36       2002      0                         16453                             9532\n",
       "3       513080   42              210               516             0           2                500                   1867               0       439408            1                        3                  13               18                     3              0        -49440             57              0               1                  0                      1               2              3                        20                            3                2                2          2                        2               68862         11043            5955          53548         11          21       2003      1                         16468                            14197\n",
       "4       192875   29               81               334             0           0               1000                    816               0       640575            0                        4                   2               18                     3          75296        -73689             64              0               2                  2                      1               1              3                         9                            3                2                2          1                        2               59726          5617           10301          41550          5          14       2004      0                         16475                            11695\n",
       "..         ...  ...              ...               ...           ...         ...                ...                    ...             ...          ...          ...                      ...                 ...              ...                   ...            ...           ...            ...            ...             ...                ...                    ...             ...            ...                       ...                          ...              ...              ...        ...                      ...                 ...           ...             ...            ...        ...         ...        ...    ...                           ...                              ...\n",
       "695    1008425   37              196               216             2           1                500                   1301               0       474615            1                        3                  12               18                     5          47627             0             42              2               1                  0                      0               4              1                         4                            1                0                0          3                        1               61433         10436           11432          39745          9          29       2011      1                         16453                            10041\n",
       "696     770702   43              229               312             0           1                500                   1435         8000000       444476            1                        1                   6                9                     0              0        -32289             37              0               2                  0                      0               0              0                        17                            3                1                0          1                        0               68623          6798           14557          50606         13          28       2013      1                         16448                            11471\n",
       "697     755099   35              209               359             2           0                500                   1639               0       639608            0                        1                  13                9                     1              0        -40797             88              0               2                  1                      1               1              5                         7                            3                1                2          0                        2               58033          9129            4598          40740          8           7       2002      0                         16499                            12063\n",
       "698     693804   44              275               376             1           2               2000                   1042               0       432061            0                        0                   6               13                     2          46822             0             33              0               2                  0                      0               4              3                        20                            3                0                1          0                        1               35253          7359            3464          24677          1           4       2007      1                         16444                            12255\n",
       "699     598086   47              263               189             2           2                500                   1283               0       433809            0                        2                   6               17                     5          54087        -61343             32              0               3                  2                      3               3              2                         5                            3                0                0          0                        1               24320          2250            4285          18092         11          15       2008      0                         16443                             9723\n",
       "\n",
       "[700 rows x 40 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 3. 对object进行编码\n",
    "\n",
    "# 3.1 标签编码\n",
    "object_feature = list(train.select_dtypes(include=['object']).columns)\n",
    "lb = LabelEncoder()\n",
    "for col in object_feature:\n",
    "    train[col] = lb.fit_transform(train[col])\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3.2 独热编码，注意两点，一是避免重复使用同一encoder实例fit，二是OneHotEncoder需要二维数组作为输入，因此我们需要对单个列进行reshape\n",
    "\n",
    "# object_feature = list(train.select_dtypes(include=['object']).columns)\n",
    "\n",
    "# for col in object_feature:\n",
    "#     # 正确的单独处理示例（避免重复使用同一encoder实例fit）：\n",
    "#     onehot = OneHotEncoder(sparse_output=False, drop='first')\n",
    "#     if col not in ['incident_date','policy_bind_date']:\n",
    "#         # 注意：OneHotEncoder需要二维数组作为输入，因此我们需要对单个列进行reshape\n",
    "#         train[col] = onehot.fit_transform(train[[col]])\n",
    "# train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>customer_months</th>\n",
       "      <th>policy_state</th>\n",
       "      <th>policy_csl</th>\n",
       "      <th>policy_deductable</th>\n",
       "      <th>policy_annual_premium</th>\n",
       "      <th>umbrella_limit</th>\n",
       "      <th>insured_zip</th>\n",
       "      <th>insured_sex</th>\n",
       "      <th>insured_education_level</th>\n",
       "      <th>insured_occupation</th>\n",
       "      <th>insured_hobbies</th>\n",
       "      <th>insured_relationship</th>\n",
       "      <th>capital-gains</th>\n",
       "      <th>capital-loss</th>\n",
       "      <th>incident_type</th>\n",
       "      <th>collision_type</th>\n",
       "      <th>incident_severity</th>\n",
       "      <th>authorities_contacted</th>\n",
       "      <th>incident_state</th>\n",
       "      <th>incident_city</th>\n",
       "      <th>incident_hour_of_the_day</th>\n",
       "      <th>number_of_vehicles_involved</th>\n",
       "      <th>property_damage</th>\n",
       "      <th>bodily_injuries</th>\n",
       "      <th>witnesses</th>\n",
       "      <th>police_report_available</th>\n",
       "      <th>total_claim_amount</th>\n",
       "      <th>injury_claim</th>\n",
       "      <th>property_claim</th>\n",
       "      <th>vehicle_claim</th>\n",
       "      <th>auto_make</th>\n",
       "      <th>auto_model</th>\n",
       "      <th>auto_year</th>\n",
       "      <th>fraud</th>\n",
       "      <th>incident_date_timestamp_days</th>\n",
       "      <th>policy_bind_date_timestamp_days</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>37</td>\n",
       "      <td>189</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1000</td>\n",
       "      <td>1466</td>\n",
       "      <td>5000000</td>\n",
       "      <td>455456</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>10</td>\n",
       "      <td>15</td>\n",
       "      <td>1</td>\n",
       "      <td>62203</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>21</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>54930</td>\n",
       "      <td>6029</td>\n",
       "      <td>5752</td>\n",
       "      <td>44452</td>\n",
       "      <td>9</td>\n",
       "      <td>26</td>\n",
       "      <td>2000</td>\n",
       "      <td>0</td>\n",
       "      <td>16426</td>\n",
       "      <td>15938</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>44</td>\n",
       "      <td>234</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>500</td>\n",
       "      <td>821</td>\n",
       "      <td>0</td>\n",
       "      <td>591805</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>14</td>\n",
       "      <td>2</td>\n",
       "      <td>31606</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>50680</td>\n",
       "      <td>5376</td>\n",
       "      <td>10156</td>\n",
       "      <td>37347</td>\n",
       "      <td>6</td>\n",
       "      <td>10</td>\n",
       "      <td>1996</td>\n",
       "      <td>0</td>\n",
       "      <td>16484</td>\n",
       "      <td>10230</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>33</td>\n",
       "      <td>23</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1000</td>\n",
       "      <td>1844</td>\n",
       "      <td>0</td>\n",
       "      <td>442490</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>16</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>-43166</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>47829</td>\n",
       "      <td>4460</td>\n",
       "      <td>9247</td>\n",
       "      <td>33644</td>\n",
       "      <td>7</td>\n",
       "      <td>36</td>\n",
       "      <td>2002</td>\n",
       "      <td>0</td>\n",
       "      <td>16453</td>\n",
       "      <td>9532</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>42</td>\n",
       "      <td>210</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>500</td>\n",
       "      <td>1867</td>\n",
       "      <td>0</td>\n",
       "      <td>439408</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>13</td>\n",
       "      <td>18</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>-49440</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>68862</td>\n",
       "      <td>11043</td>\n",
       "      <td>5955</td>\n",
       "      <td>53548</td>\n",
       "      <td>11</td>\n",
       "      <td>21</td>\n",
       "      <td>2003</td>\n",
       "      <td>1</td>\n",
       "      <td>16468</td>\n",
       "      <td>14197</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>29</td>\n",
       "      <td>81</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1000</td>\n",
       "      <td>816</td>\n",
       "      <td>0</td>\n",
       "      <td>640575</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>18</td>\n",
       "      <td>3</td>\n",
       "      <td>75296</td>\n",
       "      <td>-73689</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>9</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>59726</td>\n",
       "      <td>5617</td>\n",
       "      <td>10301</td>\n",
       "      <td>41550</td>\n",
       "      <td>5</td>\n",
       "      <td>14</td>\n",
       "      <td>2004</td>\n",
       "      <td>0</td>\n",
       "      <td>16475</td>\n",
       "      <td>11695</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>695</th>\n",
       "      <td>37</td>\n",
       "      <td>196</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>500</td>\n",
       "      <td>1301</td>\n",
       "      <td>0</td>\n",
       "      <td>474615</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>12</td>\n",
       "      <td>18</td>\n",
       "      <td>5</td>\n",
       "      <td>47627</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>61433</td>\n",
       "      <td>10436</td>\n",
       "      <td>11432</td>\n",
       "      <td>39745</td>\n",
       "      <td>9</td>\n",
       "      <td>29</td>\n",
       "      <td>2011</td>\n",
       "      <td>1</td>\n",
       "      <td>16453</td>\n",
       "      <td>10041</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>696</th>\n",
       "      <td>43</td>\n",
       "      <td>229</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>500</td>\n",
       "      <td>1435</td>\n",
       "      <td>8000000</td>\n",
       "      <td>444476</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>-32289</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>17</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>68623</td>\n",
       "      <td>6798</td>\n",
       "      <td>14557</td>\n",
       "      <td>50606</td>\n",
       "      <td>13</td>\n",
       "      <td>28</td>\n",
       "      <td>2013</td>\n",
       "      <td>1</td>\n",
       "      <td>16448</td>\n",
       "      <td>11471</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>697</th>\n",
       "      <td>35</td>\n",
       "      <td>209</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>500</td>\n",
       "      <td>1639</td>\n",
       "      <td>0</td>\n",
       "      <td>639608</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>13</td>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>-40797</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>7</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>58033</td>\n",
       "      <td>9129</td>\n",
       "      <td>4598</td>\n",
       "      <td>40740</td>\n",
       "      <td>8</td>\n",
       "      <td>7</td>\n",
       "      <td>2002</td>\n",
       "      <td>0</td>\n",
       "      <td>16499</td>\n",
       "      <td>12063</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>698</th>\n",
       "      <td>44</td>\n",
       "      <td>275</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2000</td>\n",
       "      <td>1042</td>\n",
       "      <td>0</td>\n",
       "      <td>432061</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>46822</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>20</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>35253</td>\n",
       "      <td>7359</td>\n",
       "      <td>3464</td>\n",
       "      <td>24677</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2007</td>\n",
       "      <td>1</td>\n",
       "      <td>16444</td>\n",
       "      <td>12255</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>699</th>\n",
       "      <td>47</td>\n",
       "      <td>263</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>500</td>\n",
       "      <td>1283</td>\n",
       "      <td>0</td>\n",
       "      <td>433809</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "      <td>17</td>\n",
       "      <td>5</td>\n",
       "      <td>54087</td>\n",
       "      <td>-61343</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>24320</td>\n",
       "      <td>2250</td>\n",
       "      <td>4285</td>\n",
       "      <td>18092</td>\n",
       "      <td>11</td>\n",
       "      <td>15</td>\n",
       "      <td>2008</td>\n",
       "      <td>0</td>\n",
       "      <td>16443</td>\n",
       "      <td>9723</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>700 rows × 37 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     age  customer_months  policy_state  policy_csl  policy_deductable  policy_annual_premium  umbrella_limit  insured_zip  insured_sex  insured_education_level  insured_occupation  insured_hobbies  insured_relationship  capital-gains  capital-loss  incident_type  collision_type  incident_severity  authorities_contacted  incident_state  incident_city  incident_hour_of_the_day  number_of_vehicles_involved  property_damage  bodily_injuries  witnesses  police_report_available  total_claim_amount  injury_claim  property_claim  vehicle_claim  auto_make  auto_model  auto_year  fraud  incident_date_timestamp_days  policy_bind_date_timestamp_days\n",
       "0     37              189             2           2               1000                   1466         5000000       455456            0                        5                  10               15                     1          62203             0              2               3                  2                      0               4              5                        21                            1                0                0          3                        0               54930          6029            5752          44452          9          26       2000      0                         16426                            15938\n",
       "1     44              234             1           1                500                    821               0       591805            1                        3                   2               14                     2          31606             0              0               3                  1                      2               4              6                         4                            3                0                2          1                        2               50680          5376           10156          37347          6          10       1996      0                         16484                            10230\n",
       "2     33               23             1           2               1000                   1844               0       442490            0                        2                   6               16                     5              0        -43166              2               3                  2                      3               2              3                         0                            1                0                2          1                        1               47829          4460            9247          33644          7          36       2002      0                         16453                             9532\n",
       "3     42              210             0           2                500                   1867               0       439408            1                        3                  13               18                     3              0        -49440              0               1                  0                      1               2              3                        20                            3                2                2          2                        2               68862         11043            5955          53548         11          21       2003      1                         16468                            14197\n",
       "4     29               81             0           0               1000                    816               0       640575            0                        4                   2               18                     3          75296        -73689              0               2                  2                      1               1              3                         9                            3                2                2          1                        2               59726          5617           10301          41550          5          14       2004      0                         16475                            11695\n",
       "..   ...              ...           ...         ...                ...                    ...             ...          ...          ...                      ...                 ...              ...                   ...            ...           ...            ...             ...                ...                    ...             ...            ...                       ...                          ...              ...              ...        ...                      ...                 ...           ...             ...            ...        ...         ...        ...    ...                           ...                              ...\n",
       "695   37              196             2           1                500                   1301               0       474615            1                        3                  12               18                     5          47627             0              2               1                  0                      0               4              1                         4                            1                0                0          3                        1               61433         10436           11432          39745          9          29       2011      1                         16453                            10041\n",
       "696   43              229             0           1                500                   1435         8000000       444476            1                        1                   6                9                     0              0        -32289              0               2                  0                      0               0              0                        17                            3                1                0          1                        0               68623          6798           14557          50606         13          28       2013      1                         16448                            11471\n",
       "697   35              209             2           0                500                   1639               0       639608            0                        1                  13                9                     1              0        -40797              0               2                  1                      1               1              5                         7                            3                1                2          0                        2               58033          9129            4598          40740          8           7       2002      0                         16499                            12063\n",
       "698   44              275             1           2               2000                   1042               0       432061            0                        0                   6               13                     2          46822             0              0               2                  0                      0               4              3                        20                            3                0                1          0                        1               35253          7359            3464          24677          1           4       2007      1                         16444                            12255\n",
       "699   47              263             2           2                500                   1283               0       433809            0                        2                   6               17                     5          54087        -61343              0               3                  2                      3               3              2                         5                            3                0                0          0                        1               24320          2250            4285          18092         11          15       2008      0                         16443                             9723\n",
       "\n",
       "[700 rows x 37 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 4. 数据清洗，删除冗余数据\n",
    "for col in ['policy_id','incident_date','policy_bind_date']: # ,\n",
    "    del train[col]\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------------整体概况----------------\n",
      "                                    type  row_nums  null_nums  mean_num  std_num  nunique_nums\n",
      "age                                int64       700          0        39        9            45\n",
      "customer_months                    int64       700          0       205      116           332\n",
      "policy_state                       int32       700          0         1        1             3\n",
      "policy_csl                         int32       700          0         1        1             3\n",
      "policy_deductable                  int64       700          0      1148      612             3\n",
      "policy_annual_premium            float64       700          0      1247      251           698\n",
      "umbrella_limit                     int64       700          0   1100000  2282922            11\n",
      "insured_zip                        int64       700          0    502797    74251           699\n",
      "insured_sex                        int32       700          0         0        0             2\n",
      "insured_education_level            int32       700          0         3        2             7\n",
      "insured_occupation                 int32       700          0         7        4            14\n",
      "insured_hobbies                    int32       700          0        10        6            20\n",
      "insured_relationship               int32       700          0         2        2             6\n",
      "capital-gains                      int64       700          0     25842    28108           352\n",
      "capital-loss                       int64       700          0    -26247    28465           357\n",
      "incident_type                      int32       700          0         1        1             4\n",
      "collision_type                     int32       700          0         2        1             4\n",
      "incident_severity                  int32       700          0         1        1             4\n",
      "authorities_contacted              int32       700          0         2        1             5\n",
      "incident_state                     int32       700          0         2        2             7\n",
      "incident_city                      int32       700          0         3        2             7\n",
      "incident_hour_of_the_day           int64       700          0        12        7            24\n",
      "number_of_vehicles_involved        int64       700          0         2        1             4\n",
      "property_damage                    int32       700          0         1        1             3\n",
      "bodily_injuries                    int64       700          0         1        1             3\n",
      "witnesses                          int64       700          0         1        1             4\n",
      "police_report_available            int32       700          0         1        1             3\n",
      "total_claim_amount                 int64       700          0     52423    26179           694\n",
      "injury_claim                       int64       700          0      7450     4889           665\n",
      "property_claim                     int64       700          0      7332     4787           662\n",
      "vehicle_claim                      int64       700          0     37688    18724           697\n",
      "auto_make                          int32       700          0         7        4            14\n",
      "auto_model                         int32       700          0        19       11            39\n",
      "auto_year                          int64       700          0      2005        6            21\n",
      "incident_date_timestamp_days       int64       700          0     16465       25           112\n",
      "policy_bind_date_timestamp_days    int64       700          0     11777     2670           674\n",
      "----------------整体概况----------------\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>type</th>\n",
       "      <th>row_nums</th>\n",
       "      <th>null_nums</th>\n",
       "      <th>mean_num</th>\n",
       "      <th>std_num</th>\n",
       "      <th>nunique_nums</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>39</td>\n",
       "      <td>9</td>\n",
       "      <td>45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>customer_months</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>205</td>\n",
       "      <td>116</td>\n",
       "      <td>332</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_state</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_csl</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_deductable</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1148</td>\n",
       "      <td>612</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_annual_premium</th>\n",
       "      <td>float64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1247</td>\n",
       "      <td>251</td>\n",
       "      <td>698</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>umbrella_limit</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1100000</td>\n",
       "      <td>2282922</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_zip</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>502797</td>\n",
       "      <td>74251</td>\n",
       "      <td>699</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_sex</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_education_level</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_occupation</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>4</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_hobbies</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>6</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_relationship</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>capital-gains</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>25842</td>\n",
       "      <td>28108</td>\n",
       "      <td>352</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>capital-loss</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>-26247</td>\n",
       "      <td>28465</td>\n",
       "      <td>357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_type</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>collision_type</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_severity</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>authorities_contacted</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_state</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_city</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_hour_of_the_day</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>7</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>number_of_vehicles_involved</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>property_damage</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bodily_injuries</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>witnesses</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>police_report_available</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>total_claim_amount</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>52423</td>\n",
       "      <td>26179</td>\n",
       "      <td>694</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>injury_claim</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>7450</td>\n",
       "      <td>4889</td>\n",
       "      <td>665</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>property_claim</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>7332</td>\n",
       "      <td>4787</td>\n",
       "      <td>662</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vehicle_claim</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>37688</td>\n",
       "      <td>18724</td>\n",
       "      <td>697</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>auto_make</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>4</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>auto_model</th>\n",
       "      <td>int32</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>19</td>\n",
       "      <td>11</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>auto_year</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>2005</td>\n",
       "      <td>6</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_date_timestamp_days</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>16465</td>\n",
       "      <td>25</td>\n",
       "      <td>112</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_bind_date_timestamp_days</th>\n",
       "      <td>int64</td>\n",
       "      <td>700</td>\n",
       "      <td>0</td>\n",
       "      <td>11777</td>\n",
       "      <td>2670</td>\n",
       "      <td>674</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    type  row_nums  null_nums  mean_num  std_num  nunique_nums\n",
       "age                                int64       700          0        39        9            45\n",
       "customer_months                    int64       700          0       205      116           332\n",
       "policy_state                       int32       700          0         1        1             3\n",
       "policy_csl                         int32       700          0         1        1             3\n",
       "policy_deductable                  int64       700          0      1148      612             3\n",
       "policy_annual_premium            float64       700          0      1247      251           698\n",
       "umbrella_limit                     int64       700          0   1100000  2282922            11\n",
       "insured_zip                        int64       700          0    502797    74251           699\n",
       "insured_sex                        int32       700          0         0        0             2\n",
       "insured_education_level            int32       700          0         3        2             7\n",
       "insured_occupation                 int32       700          0         7        4            14\n",
       "insured_hobbies                    int32       700          0        10        6            20\n",
       "insured_relationship               int32       700          0         2        2             6\n",
       "capital-gains                      int64       700          0     25842    28108           352\n",
       "capital-loss                       int64       700          0    -26247    28465           357\n",
       "incident_type                      int32       700          0         1        1             4\n",
       "collision_type                     int32       700          0         2        1             4\n",
       "incident_severity                  int32       700          0         1        1             4\n",
       "authorities_contacted              int32       700          0         2        1             5\n",
       "incident_state                     int32       700          0         2        2             7\n",
       "incident_city                      int32       700          0         3        2             7\n",
       "incident_hour_of_the_day           int64       700          0        12        7            24\n",
       "number_of_vehicles_involved        int64       700          0         2        1             4\n",
       "property_damage                    int32       700          0         1        1             3\n",
       "bodily_injuries                    int64       700          0         1        1             3\n",
       "witnesses                          int64       700          0         1        1             4\n",
       "police_report_available            int32       700          0         1        1             3\n",
       "total_claim_amount                 int64       700          0     52423    26179           694\n",
       "injury_claim                       int64       700          0      7450     4889           665\n",
       "property_claim                     int64       700          0      7332     4787           662\n",
       "vehicle_claim                      int64       700          0     37688    18724           697\n",
       "auto_make                          int32       700          0         7        4            14\n",
       "auto_model                         int32       700          0        19       11            39\n",
       "auto_year                          int64       700          0      2005        6            21\n",
       "incident_date_timestamp_days       int64       700          0     16465       25           112\n",
       "policy_bind_date_timestamp_days    int64       700          0     11777     2670           674"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 5. 模型训练\n",
    "## 5.1 训练集以及特征集的划分\n",
    "y = train['fraud']\n",
    "x = train.drop(columns=['fraud'])\n",
    "overViewAnalysis(x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 5.2 模型训练，模型参数设定\n",
    "\n",
    "## GBDT 模型\n",
    "GBDT_param = {\n",
    "    'loss': 'log_loss',\n",
    "    'learning_rate': 0.1,\n",
    "    'n_estimators': 30,\n",
    "    'max_depth': 3,\n",
    "    'min_samples_split': 300\n",
    "}\n",
    "\n",
    "GBDT_clf = GradientBoostingClassifier()\n",
    "\n",
    "# 决策树模型\n",
    "tree_param = {\n",
    "    'criterion': 'gini',\n",
    "    'max_depth': 30,\n",
    "    'min_impurity_decrease': 0.1,\n",
    "    'min_samples_leaf': 2\n",
    "\n",
    "}\n",
    "Tree_clf = DecisionTreeClassifier(**tree_param)  #\n",
    "\n",
    "# xgboost模型\n",
    "xgboost_param = {\n",
    "    'learning_rate': 0.01,\n",
    "    'reg_alpha': 0.,\n",
    "    'max_depth': 3,\n",
    "    'gamma': 0,\n",
    "    'min_child_weight': 1\n",
    "}\n",
    "\n",
    "xgboost_clf = xgboost.XGBClassifier(**xgboost_param)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "auc:0.782051282051282\n",
      "auc:0.8664529914529915\n",
      "auc:0.875534188034188\n",
      "auc:0.8327991452991452\n",
      "auc:0.8301282051282051\n",
      "auc:0.8226495726495726\n",
      "auc:0.9326923076923077\n",
      "auc:0.8028846153846154\n",
      "auc:0.8498931623931624\n",
      "auc:0.8581011351909185\n"
     ]
    }
   ],
   "source": [
    "## 5.3 模型训练过程中的交叉验证集划分 KFold\n",
    "sk = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)\n",
    "for train_, test_ in sk.split(x, y):\n",
    "    # 注意是x.iloc[train_]，不是x[train_]\n",
    "    # X_train训练集，X_test验证集，y_train就是预测值的训练集，y_test就是预测值的验证集\n",
    "    X_train, X_test = x.iloc[train_], x.iloc[test_]\n",
    "    y_train, y_test = y.iloc[train_], y.iloc[test_]\n",
    "    \n",
    "    \n",
    "    ## 5.4 模型训练\n",
    "    \n",
    "    xgboost_clf.fit(X_train, y_train)\n",
    "    \n",
    "    ## 5.5 模型评估\n",
    "    pred = xgboost_clf.predict_proba(X_test)[:,-1]\n",
    "    \n",
    "    print('auc:{}'.format(roc_auc_score(y_test, pred)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "----------------整体概况----------------\n",
      "                                    type  row_nums  null_nums  mean_num  std_num  nunique_nums\n",
      "age                                int64       300          0        39        9            41\n",
      "customer_months                    int64       300          0       199      113           202\n",
      "policy_state                       int32       300          0         1        1             3\n",
      "policy_csl                         int32       300          0         1        1             3\n",
      "policy_deductable                  int64       300          0      1108      613             3\n",
      "policy_annual_premium            float64       300          0      1277      237           300\n",
      "umbrella_limit                     int64       300          0   1103333  2334700             9\n",
      "insured_zip                        int64       300          0    500223    73345           300\n",
      "insured_sex                        int32       300          0         0        0             2\n",
      "insured_education_level            int32       300          0         3        2             7\n",
      "insured_occupation                 int32       300          0         7        4            14\n",
      "insured_hobbies                    int32       300          0        10        6            20\n",
      "insured_relationship               int32       300          0         3        2             6\n",
      "capital-gains                      int64       300          0     23610    27518           140\n",
      "capital-loss                       int64       300          0    -28066    27294           169\n",
      "incident_type                      int32       300          0         1        1             4\n",
      "collision_type                     int32       300          0         2        1             4\n",
      "incident_severity                  int32       300          0         1        1             4\n",
      "authorities_contacted              int32       300          0         2        1             5\n",
      "incident_state                     int32       300          0         2        2             7\n",
      "incident_city                      int32       300          0         3        2             7\n",
      "incident_hour_of_the_day           int64       300          0        12        7            24\n",
      "number_of_vehicles_involved        int64       300          0         2        1             4\n",
      "property_damage                    int32       300          0         1        1             3\n",
      "bodily_injuries                    int64       300          0         1        1             3\n",
      "witnesses                          int64       300          0         2        1             4\n",
      "police_report_available            int32       300          0         1        1             3\n",
      "total_claim_amount                 int64       300          0     53400    27095           298\n",
      "injury_claim                       int64       300          0      7379     4889           293\n",
      "property_claim                     int64       300          0      7569     4928           289\n",
      "vehicle_claim                      int64       300          0     38480    19388           297\n",
      "auto_make                          int32       300          0         6        4            14\n",
      "auto_model                         int32       300          0        20       11            39\n",
      "auto_year                          int64       300          0      2006        6            21\n",
      "incident_date_timestamp_days       int64       300          0     16465       25            95\n",
      "policy_bind_date_timestamp_days    int64       300          0     11607     2729           299\n",
      "----------------整体概况----------------\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>type</th>\n",
       "      <th>row_nums</th>\n",
       "      <th>null_nums</th>\n",
       "      <th>mean_num</th>\n",
       "      <th>std_num</th>\n",
       "      <th>nunique_nums</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>age</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>39</td>\n",
       "      <td>9</td>\n",
       "      <td>41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>customer_months</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>199</td>\n",
       "      <td>113</td>\n",
       "      <td>202</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_state</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_csl</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_deductable</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1108</td>\n",
       "      <td>613</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_annual_premium</th>\n",
       "      <td>float64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1277</td>\n",
       "      <td>237</td>\n",
       "      <td>300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>umbrella_limit</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1103333</td>\n",
       "      <td>2334700</td>\n",
       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_zip</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>500223</td>\n",
       "      <td>73345</td>\n",
       "      <td>300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_sex</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_education_level</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_occupation</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>7</td>\n",
       "      <td>4</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_hobbies</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>10</td>\n",
       "      <td>6</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>insured_relationship</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>capital-gains</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>23610</td>\n",
       "      <td>27518</td>\n",
       "      <td>140</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>capital-loss</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>-28066</td>\n",
       "      <td>27294</td>\n",
       "      <td>169</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_type</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>collision_type</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_severity</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>authorities_contacted</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_state</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_city</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_hour_of_the_day</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>7</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>number_of_vehicles_involved</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>property_damage</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bodily_injuries</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>witnesses</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>police_report_available</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>total_claim_amount</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>53400</td>\n",
       "      <td>27095</td>\n",
       "      <td>298</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>injury_claim</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>7379</td>\n",
       "      <td>4889</td>\n",
       "      <td>293</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>property_claim</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>7569</td>\n",
       "      <td>4928</td>\n",
       "      <td>289</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vehicle_claim</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>38480</td>\n",
       "      <td>19388</td>\n",
       "      <td>297</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>auto_make</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>4</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>auto_model</th>\n",
       "      <td>int32</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>11</td>\n",
       "      <td>39</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>auto_year</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>2006</td>\n",
       "      <td>6</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>incident_date_timestamp_days</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>16465</td>\n",
       "      <td>25</td>\n",
       "      <td>95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>policy_bind_date_timestamp_days</th>\n",
       "      <td>int64</td>\n",
       "      <td>300</td>\n",
       "      <td>0</td>\n",
       "      <td>11607</td>\n",
       "      <td>2729</td>\n",
       "      <td>299</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                    type  row_nums  null_nums  mean_num  std_num  nunique_nums\n",
       "age                                int64       300          0        39        9            41\n",
       "customer_months                    int64       300          0       199      113           202\n",
       "policy_state                       int32       300          0         1        1             3\n",
       "policy_csl                         int32       300          0         1        1             3\n",
       "policy_deductable                  int64       300          0      1108      613             3\n",
       "policy_annual_premium            float64       300          0      1277      237           300\n",
       "umbrella_limit                     int64       300          0   1103333  2334700             9\n",
       "insured_zip                        int64       300          0    500223    73345           300\n",
       "insured_sex                        int32       300          0         0        0             2\n",
       "insured_education_level            int32       300          0         3        2             7\n",
       "insured_occupation                 int32       300          0         7        4            14\n",
       "insured_hobbies                    int32       300          0        10        6            20\n",
       "insured_relationship               int32       300          0         3        2             6\n",
       "capital-gains                      int64       300          0     23610    27518           140\n",
       "capital-loss                       int64       300          0    -28066    27294           169\n",
       "incident_type                      int32       300          0         1        1             4\n",
       "collision_type                     int32       300          0         2        1             4\n",
       "incident_severity                  int32       300          0         1        1             4\n",
       "authorities_contacted              int32       300          0         2        1             5\n",
       "incident_state                     int32       300          0         2        2             7\n",
       "incident_city                      int32       300          0         3        2             7\n",
       "incident_hour_of_the_day           int64       300          0        12        7            24\n",
       "number_of_vehicles_involved        int64       300          0         2        1             4\n",
       "property_damage                    int32       300          0         1        1             3\n",
       "bodily_injuries                    int64       300          0         1        1             3\n",
       "witnesses                          int64       300          0         2        1             4\n",
       "police_report_available            int32       300          0         1        1             3\n",
       "total_claim_amount                 int64       300          0     53400    27095           298\n",
       "injury_claim                       int64       300          0      7379     4889           293\n",
       "property_claim                     int64       300          0      7569     4928           289\n",
       "vehicle_claim                      int64       300          0     38480    19388           297\n",
       "auto_make                          int32       300          0         6        4            14\n",
       "auto_model                         int32       300          0        20       11            39\n",
       "auto_year                          int64       300          0      2006        6            21\n",
       "incident_date_timestamp_days       int64       300          0     16465       25            95\n",
       "policy_bind_date_timestamp_days    int64       300          0     11607     2729           299"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "## 5.6 模型预测\n",
    "\n",
    "### 5.6.1 概况预览\n",
    "need_pred = pd.read_csv(f\"{base_dir}\\\\test.csv\")\n",
    "# overViewAnalysis(need_pred)\n",
    "\n",
    "### 5.6.2 空值填充\n",
    "need_pred = need_pred.fillna({'authorities_contacted': 'no_contacted'})\n",
    "need_pred['authorities_contacted'].isnull().sum()\n",
    "\n",
    "### 5.6.3 特征编码\n",
    "need_pred['incident_date_timestamp_days'] = date_to_timestamp_days(need_pred['incident_date']) \n",
    "need_pred['policy_bind_date_timestamp_days'] = date_to_timestamp_days(need_pred['policy_bind_date']) \n",
    "\n",
    "object_feature = list(need_pred.select_dtypes(include=['object']).columns)\n",
    "\n",
    "# for col in object_feature:\n",
    "#     # 正确的单独处理示例（避免重复使用同一encoder实例fit）：\n",
    "#     onehot = OneHotEncoder(sparse_output=False, drop='first')\n",
    "#     if col not in ['incident_date','policy_bind_date']:\n",
    "#         # 注意：OneHotEncoder需要二维数组作为输入，因此我们需要对单个列进行reshape\n",
    "#         need_pred[col] = onehot.fit_transform(need_pred[[col]])\n",
    "\n",
    "\n",
    "\n",
    "for col in object_feature:\n",
    "    if col not in ['incident_date','policy_bind_date']:\n",
    "        # 注意：使用标签编码时,使用lb.transform,不用使用fit_transfomr再次进行拟合\n",
    "        need_pred[col] = lb.fit_transform(need_pred[col])\n",
    "        \n",
    "        \n",
    "### 5.6.4 删除冗余数据\n",
    "for col in ['policy_id','incident_date','policy_bind_date']: # ,\n",
    "    del need_pred[col]\n",
    "\n",
    "overViewAnalysis(need_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "### 5.6.5 模型预测\n",
    "need_pred_predvalue = xgboost_clf.predict_proba(need_pred)[:, -1]\n",
    "need_pred_predvalue\n",
    "# 将预测概率转换为0,1标签\n",
    "final_pred = np.where(need_pred_predvalue >= 0.5, 1, 0)\n",
    "\n",
    "### 5.6.6 结果输出\n",
    "sub_df = pd.read_csv(f'{base_dir}\\\\submission.csv')\n",
    "sub_df['fraud'] = final_pred\n",
    "sub_df.to_csv(f'{base_dir}\\\\base_line.csv',index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "dataMining_insuranceAntiFraud",
   "language": "python",
   "name": "datamining_insuranceantifraud"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
